# Load packages
library("tidyverse")
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("ggforce")
library("knitr")
library("lubridate")
library("plotly")
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library("zoo")
## Warning: package 'zoo' was built under R version 4.4.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library("spls")
## Sparse Partial Least Squares (SPLS) Regression and
## Classification (version 2.3-2)
library("GGally")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library("corrplot")
## corrplot 0.95 loaded
library("plotly")
library("rgl")
library("scatterplot3d")
library("knitr")
# Read in data
cv <- read.csv("https://jmc108.github.io/hugen2073-textbook/exercises/4_amounts_and_tables/cv.csv")
lrr <- read.csv("/Users/jonathanchernus/Documents/Teaching/hugen2073-textbook/exercises/6_associations_and_trends/lrr.csv")
sc <- read.csv("/Users/jonathanchernus/Documents/Teaching/hugen2073-textbook/exercises/6_associations_and_trends/sc.csv")
pcs <- read.csv("/Users/jonathanchernus/Downloads/archive/genetic_data_train.csv")
util <- read_csv("/Users/jonathanchernus/Documents/Teaching/hugen2073-textbook/exercises/6_associations_and_trends/utility.csv")
## Rows: 8783 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): TYPE, DATE, COST, NOTES
## dbl (1): USAGE (kWh)
## time (2): START TIME, END TIME
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The chunk below simulates genetic drift in two populations of
different sizes (Ne=200 Ne=2000).
We start with an initial allele frequency of p0=0.3 and
follow p for 20 generations (generation).
Each population is simulated 12 times, as reflected in the grouping
variable pop. So, for example, Ne200_7 in the
pop column means “replicate number 7 of the 200-person
population.”
Inspect the data to make sure you understand the variables.
set.seed(2073)
# ---- Wright–Fisher drift simulator ----
simulate_drift <- function(p0, Ne, gens, n_pops, label_prefix = "pop") {
# p0: starting allele frequency
# Ne: effective population size (diploid)
# gens: number of generations
# n_pops: number of replicate populations
map_dfr(seq_len(n_pops), function(i) {
p <- numeric(gens + 1)
p[1] <- p0
for (g in 2:(gens + 1)) {
# next generation allele count ~ Binomial(2Ne, p_prev)
k <- rbinom(1, size = 2 * Ne, prob = p[g - 1])
p[g] <- k / (2 * Ne)
}
tibble(
pop = paste0(label_prefix, i),
generation = 0:gens,
p = p
)
})
}
# ---- realistic-ish parameters ----
p0 <- 0.30 # common-ish starting allele frequency
gens <- 60
n_pops <- 12
# Two scenarios: small vs large Ne to show volatility differences
d_small <- simulate_drift(p0 = p0, Ne = 200, gens = gens, n_pops = n_pops, label_prefix = "Ne200_")
d_large <- simulate_drift(p0 = p0, Ne = 2000, gens = gens, n_pops = n_pops, label_prefix = "Ne2000_")
d <- bind_rows(
d_small %>% mutate(Ne = 200),
d_large %>% mutate(Ne = 2000)
)
kable(head(d))
| pop | generation | p | Ne |
|---|---|---|---|
| Ne200_1 | 0 | 0.3000 | 200 |
| Ne200_1 | 1 | 0.2600 | 200 |
| Ne200_1 | 2 | 0.2925 | 200 |
| Ne200_1 | 3 | 0.2600 | 200 |
| Ne200_1 | 4 | 0.3175 | 200 |
| Ne200_1 | 5 | 0.3450 | 200 |
Now make a line graph of the allele frequency over time:
facet by population size (Ne), in such a way that
one plot is on top of the other (rather than side by side)
draw a line for each replicate, but make them slightly
translucent with alpha (color them all the same; what
aesthetic should you use instead of color?)
include + theme_classic() to remove the plot
background
but add a horizontal dotted line to show the initial allele
frequency of p0=0.3 (what geom do you need? Include
linetype="dashed")
# Your code here
ggplot(d, aes(x = generation, y = p, group = pop)) +
geom_hline(yintercept = p0, linetype = "dashed") +
geom_line(alpha = 0.6) +
facet_wrap(~Ne, ncol = 1) +
coord_cartesian(ylim = c(0, 1)) +
labs(
title = "Genetic drift (Wright–Fisher) in replicate populations",
subtitle = "Same starting allele frequency, different effective population sizes",
x = "Generation",
y = "Allele frequency (p)"
) +
theme_classic()
Your instructor thought his electricity bill looked high, so he downloaded a year’s data from the utility company.
This code produces an interactive bar graph of electricity usage every hour, colored by hourly electricity cost.
No need to change anything here - just play with the interactive plot for a few moments and check out how surprisingly simple the plotting code is.
d2 <- util %>%
mutate(
datetime = as.POSIXct(as.Date(DATE, format = "%m/%d/%y"), tz = "America/New_York") +
as.numeric(`START TIME`),
cost = readr::parse_number(COST)
) %>%
arrange(datetime)
p <- plot_ly(
data = d2,
x = ~datetime,
y = ~`USAGE (kWh)`,
type = "bar",
customdata = ~cost,
hovertemplate = paste(
"%{x}<br>",
"Usage: %{y:.2f} kWh<br>",
"Cost: $%{customdata:.2f}",
"<extra></extra>"
),
marker = list(
color = ~cost, # <- continuous color mapping
showscale = TRUE, # <- colorbar legend
colorbar = list(title = "Cost ($)")
)
) %>%
layout(
title = "Hourly Usage (kWh), Colored by Cost ($)",
xaxis = list(
title = "Time",
type = "date",
rangeslider = list(visible = TRUE)
),
yaxis = list(title = "Usage (kWh)")
)
p
The dataset lrr (read in above) contains LRR values on
chromosome 22 for a single scan.
Make a scatterplot of the data. Then use the function
rollmean from the zoo package to calculate
moving average, and plot it as a connected line. Include the option
fill=NA. Vary the k parameter to see if you
can find one that does a good job of pinpointing the deletion. (Change
xlim() if that helps.)
lrr %>%
mutate(avg=rollmean(lrr, k=5, align="center", fill=NA)) %>%
ggplot() +
geom_point(aes(x=pos/10^6, y=lrr)) +
geom_line(aes(x=pos/10^6, y=avg)) +
xlab("Chr 22 (Mb)") +
xlim(c(15,25))
## Warning: <ggplot> %+% x was deprecated in ggplot2 4.0.0.
## ℹ Please use <ggplot> + x instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 732 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 734 rows containing missing values or values outside the scale range
## (`geom_line()`).
Try the same thing with a LOESS curve
(geom_smooth(method="loess")) instead of
rollmean. Turn off the error bar, and vary the
span parameter.
lrr %>%
mutate(avg=rollmean(lrr, k=5, align="center", fill=NA)) %>%
ggplot() +
geom_point(aes(x=pos/10^6, y=lrr)) +
geom_smooth(aes(x=pos/10^6, y=avg),
se=FALSE,
method="loess",
span=0.02) +
xlab("Chr 22 (Mb)")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
Now, let’s try making a scatterplot of gene expression in two samples
in the sc.csv dataset (read in above). Pick two samples and
make a scatterplot of the expression of all the genes. Apply the
log2(x+1) transformation first. (Don’t use
sample1 and sample2, as they’re so similar
that they make for a boring plot.)
How many points are in the plot? How many points do you think you can see in the plot?
In your first plot, try using the alpha parameter to
control overplotting.
# Your code here
sc %>%
ggplot() +
geom_point(aes(x=log2(sample1+1),y=log2(sample5+1)),
alpha=0.1)
Now try a few more ways of addressing the overplotting problem.
First, try out geom_bin2d and geom_hex
Try filtering out genes that are unexpressed in either sample
Also try adjusting bin parameters
sc %>%
ggplot() +
geom_bin2d(aes(x=log2(sample1+1),y=log2(sample5+1)))
## `stat_bin2d()` using `bins = 30`. Pick better value `binwidth`.
sc %>%
ggplot() +
geom_hex(aes(x=log2(sample1+1),y=log2(sample5+1)), bins=30)
sc %>%
filter(sample1 > 0 & sample2 > 0) %>%
ggplot() +
geom_bin2d(aes(x=log2(sample1+1),y=log2(sample5+1)), bins=50)
sc %>%
filter(sample1 > 0 & sample2 > 0) %>%
ggplot() +
geom_hex(aes(x=log2(sample1+1),y=log2(sample5+1)), bins=50)
Now try two versions of a 2D density plot:
geom_density_2d and geom_density_2d_filled.
Notice the default plot is not helpful without filtering lowly-expressed
genes. Try putting the two geoms in a single plot and overlaying a
scatterplot.
Note there are some more variations here.
sc %>%
filter(sample1 > 0 & sample2 > 0) %>%
ggplot() +
geom_density_2d_filled(aes(x=log2(sample1+1),y=log2(sample5+1))) +
geom_density_2d(aes(x=log2(sample1+1),y=log2(sample5+1))) +
geom_point(aes(x=log2(sample1+1),y=log2(sample5+1)), alpha=0.1)
If we have two variables that change with time, we could plot a line graph of each. Here, gene-expression measurements from synchronized cells are summarized by two principal components.
data(yeast)
# genes x timepoints
dim(yeast$y)
## [1] 542 18
# transpose so rows = timepoints
pca <- prcomp(t(yeast$y), scale. = TRUE)
pca_df <- data.frame(
PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
idx = seq_len(nrow(pca$x)) # implicit time index
)
pca_df %>%
ggplot() +
geom_point(aes(x=idx, y=PC1)) + geom_line(aes(x=idx, y=PC1), color="red") +
geom_point(aes(x=idx, y=PC2)) + geom_line(aes(x=idx, y=PC2), color="blue") +
ylab("PC1 (red) and PC2 (blue)")+
ggtitle("Two PCs over time")
Instead, we could look at how PC1 and PC2
relate to each other directly. Make a connected scatterplot to show
this.
Try geom_line - why doesn’t it work?
Use geom_path instead.
Try including arrow=arrow().
# Your code here
ggplot(pca_df, aes(PC1, PC2)) +
geom_path(aes(color=idx), arrow=arrow()) +
geom_point()
We can use the ggpairs function from GGally
to make pairwise plots of many variables.
Try using ggpairs to explore expression in 5 samples in
the sc dataset. log2(x_1)-transform the
variables and filter out non-expressed genes first.
samples <- c("sample1", "sample2", "sample3", "sample4", "sample5")
sc %>%
select(all_of(samples)) %>%
filter(rowSums(sc[, samples] <= 0) == 0) %>%
mutate(across(everything(), function(x) log2(x + 1))) %>%
ggpairs()
Now try using the corrplot function from the
corrplot package.
Try using the same code you used in the chunk above to filter and process the data
This time, use all the samples
Pipe it into the cor function to calculate
correlations
Pipe the correlation matrix into corrplot
Experiment with different values of the method
argument (or use the corrplot.mixed function and separately
specify lower and upper; note the default
plotting parameters are not wonderful)
sc %>%
filter(rowSums(sc[, samples] <= 0) == 0) %>%
mutate(across(everything(), function(x) log2(x + 1))) %>%
cor() %>%
corrplot.mixed(corr=., upper="ellipse", lower="number")
There are several ways to make 3D scatterplots in R. These include
plotly::plot_ly(type="scatter3d", mode="markers")
rgl::plot3d
scatterplot3d:scatterplot3d (only a 2D
projection)
Try using at least one of the above to make a scatterplot of the
first 3 PCs of ancestry in the dataset pcs. Color the
points by Ancestry. You will probably need to check the
documentation of the plotting function you use (hint: look for examples
at the bottom of the help page).
# Your code here
plot_ly(pcs, x=~PC1, y=~PC2, z=~PC3, type="scatter3d", mode="markers", color=pcs$Ancestry)
plot3d(pcs$PC1, pcs$PC2, pcs$PC3, col=as.numeric(factor(pcs$Ancestry)))
scatterplot3d(pcs$PC1, pcs$PC2, pcs$PC3, color=as.numeric(factor(pcs$Ancestry)))
Let’s make a parallel set-style plot to show how the categorical
variables Type and clinsig are related in the
cv dataset.
There are several packages for these kinds of plots, and they all
seem to be a little quirky. We will try geom_parallel_sets
from the ggforce package.
First we need to use some helper functions to get the data in shape
for geom_parallel_set.
Currently, the data look like this:
# Apply head(cv) here
We need to aggregate the cross-category counts and convert that into parallel sets format.
Copy this code and run it in the chunk below:
# Aggregate counts
d_counts <- cv %>%
count(Type, clinsig, name = "n")
# Convert to parallel-sets format
d_ps <- gather_set_data(d_counts, c("Type", "clinsig"))
head(d_counts)
head(d_ps)
# Aggregate counts
d_counts <- cv %>%
count(Type, clinsig, name = "n")
head(d_counts)
## Type clinsig n
## 1 Deletion Benign 181
## 2 Deletion Likely benign 290
## 3 Deletion Likely pathogenic 470
## 4 Deletion Pathogenic 8610
## 5 Deletion Uncertain significance 674
## 6 Duplication Benign 126
# Convert to parallel-sets format
d_ps <- gather_set_data(d_counts, c("Type", "clinsig"))
head(d_ps)
## Type clinsig n id x y
## 1 Deletion Benign 181 1 Type Deletion
## 2 Deletion Likely benign 290 2 Type Deletion
## 3 Deletion Likely pathogenic 470 3 Type Deletion
## 4 Deletion Pathogenic 8610 4 Type Deletion
## 5 Deletion Uncertain significance 674 5 Type Deletion
## 6 Duplication Benign 126 6 Type Duplication
Now we’re ready to plot
Use ggplot
with d_ps as the input data frame
and aes(x, id = id, split = y, value = n)
Also include these three geoms
geom_parallel_sets
geom_parallel_sets_axes
geom_parallel_sets_labels
To improve the plot:
adjust alpha in one of the layers (which?)
adjust size in one of the layers (which?)
include
axis.width = 0.25, color = "grey30", fill = "grey95" in one
of the layers (which?)
try coloring the ribbons by either Type or
clinsig (which layer?)
# Plot: straight-sided parallel sets
ggplot(d_ps, aes(x, id = id, split = y, value = n)) +
geom_parallel_sets(aes(fill=Type), alpha=0.7) +
geom_parallel_sets_axes(axis.width = 0.25,
color = "grey30",
fill = "grey95") +
geom_parallel_sets_labels(size = 3)